In [1]:
# Read allophant results:
import pickle

result_files = {
    "en": "output/allophant_en_sr16000_ipa.pkl",
    "es": "output/allophant_es_sr16000_ipa.pkl",
}

results = {}
for lang, file in result_files.items():
    with open(file, "rb") as f:
        results[lang] = pickle.load(f)
In [2]:
# Show one example of each language:
for lang, data in results.items():
    print(f"Language: {lang}")
    for wav_file, res in data.items():
        print(f"Wav: {wav_file}")
        print(f"Result: {res}")
        break
Language: en
Wav: input_audios/output/words_wav/en/sr16000/rollback_en1075135_en.Jamaica.wav
Result: ['r', 'o̞ː', 'l', 'b', 'æ', 'k']
Language: es
Wav: input_audios/output/words_wav/es/sr16000/baratija_es023913_es.Castellano.wav
Result: ['b', 'a', 'ɾ', 'a', 't̪', 'i', 'x', 'a']
In [3]:
type(res)
Out[3]:
list
In [4]:
# parse results:
import pandas as pd
from pathlib import Path

    
def parse_file_name(file_name):
    """Extract data from file_name 
    e.g. desde_es061900_es.Argentina.wav -> desde (word), es061900 (id), es.Argentina (accent)
    """
    parts = file_name.split('_')
    word = parts[0]
    id_ = parts[1]
    accent = '_'.join(parts[2:]).replace('.wav', '')
    return word, id_, accent



# Make df with cols: lang, accent, word, phone_list, id, wav_file:
def make_df(results):
    data = []
    for lang, lang_data in results.items():
        for wav_file, res in lang_data.items():
            file_name = Path(wav_file).stem
            word, id_, accent = parse_file_name(file_name)
            phones = res
            data.append({
                'lang': lang,
                'accent': accent,
                'word': word,
                'phone_list': phones,
                'id': id_,
                'wav_file': f"{file_name}.wav",
            })
    return pd.DataFrame(data)

df_phones = make_df(results)
In [5]:
df_phones.sample(6)
Out[5]:
lang accent word phone_list id wav_file
17938 en en.Australian tony [t, ɑ, n, i] en1090440 tony_en1090440_en.Australian.wav
98634 en en.uk.Yorkshire diversification [d, æʌ̯, v, ə, s, ɪ, f, ɪ, k, e, ʃ, ə, n] en1026122-55 diversification_en1026122-55_en.uk.Yorkshire.wav
42831 en en.us.south fifth [p, eː, θ] en033792 fifth_en033792_en.us.south.wav
192146 es es.Mexico falencia [f, ə, l, ɛ, n, s̪̻, i, a] es087739 falencia_es087739_es.Mexico.wav
29690 en en.Jamaica splurge [s, p, l, e̞, d̠ʒ] en082603 splurge_en082603_en.Jamaica.wav
116131 en en.scot overcharge [o, v, ɚː, t̠ʃ, ɑ, ɹ, d̠ʒ] en1061859 overcharge_en1061859_en.scot.wav
In [6]:
# df_phones.query("word.str.endswith('pivot') & lang == 'en'").sample(6)
df_phones.query("word.str.endswith('r') & lang == 'es'").sample(6)
Out[6]:
lang accent word phone_list id wav_file
185478 es es.Mexico catalizador [e̞, t̪, a, l, ɪ, z̪, a, ð, o, ɾ] es036184 catalizador_es036184_es.Mexico.wav
202354 es es.Argentina denegar [d̪, e, n, i, ɣ, ɑ, r] es059261 denegar_es059261_es.Argentina.wav
181725 es es.Mexico interlocutor [ɪ, n, t̪, e, r, l, o, k, ʊ, t̪, o, ɾ] es112375 interlocutor_es112375_es.Mexico.wav
175381 es es.Mexico informar [ɪ, n, f, o, r, m, aː] es109420 informar_es109420_es.Mexico.wav
205125 es es.Castellano reconfortar [r, e, k, o, m, f, ɔ, ɾ, t̪, a] es164042 reconfortar_es164042_es.Castellano.wav
207569 es es.Castellano amasar [a, m, a, s, a] es010955 amasar_es010955_es.Castellano.wav

EDA¶

In [7]:
print(f"es unique words: {df_phones[df_phones['lang'] == 'es']['word'].nunique()}")
print(f"en unique words: {df_phones[df_phones['lang'] == 'en']['word'].nunique()}")
es unique words: 18795
en unique words: 18422
In [8]:
# Combinations of accents:
pd.set_option('display.max_colwidth', None)  # Show full content of each cell

df_tmp = (
    df_phones
    .groupby(['lang', 'word'])
    .agg({'accent': lambda x: ", ".join(sorted(list(set(x))))})
    .reset_index()
    .groupby(['lang', 'accent'], as_index=False)
    .size()
    .sort_values(by=['lang', 'size'], ascending=[True, False])
)
# if freq < X, accent="others":
df_tmp.loc[df_tmp['size'] < 300, 'accent'] = 'otros'
df_tmp = (
    df_tmp
    .groupby(['lang', 'accent'])
    .agg({'size': 'sum'})
    .reset_index()
    .sort_values(by=['lang', 'size'], ascending=[True, False])
)
df_tmp
Out[8]:
lang accent size
3 en en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.uk.rp, en.us.south, en.us.us 7819
1 en en.Australian, en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.uk.rp, en.us.south, en.us.us 7437
4 en en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.uk.rp, en.us.us 1025
2 en en.Australian, en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.uk.rp, en.us.us 936
0 en en.Australian, en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.uk.rp, en.us.south 532
5 en en.Irish, en.Jamaica, en.scot, en.uk.Yorkshire, en.uk.general, en.us.south, en.us.us 346
6 en otros 327
7 es es.Argentina, es.Castellano, es.Mexico 18775
8 es otros 20
In [9]:
import plotnine as p9

df_plot = (
    df_phones
    .groupby(['lang', 'accent'])
    .agg({'word': 'nunique'})
    .reset_index()
    .rename(columns={'word': 'unique_words'})
)
p = (
    p9.ggplot(df_plot, p9.aes(x='accent', y='unique_words'))
    + p9.geom_col(fill="lightblue", color="black", alpha=.7)
    + p9.theme_bw()
    + p9.facet_wrap('~lang', scales='free_x')
    + p9.theme(
        figure_size=(6, 3),
        axis_text_x=p9.element_text(rotation=45, hjust=1),
        legend_position='bottom',
    )
    + p9.labs(x="Acento", y="Palabras únicas")
)
p.show()
No description has been provided for this image

Phone distances¶

In [10]:
# import Levenshtein
from itertools import combinations
# from nltk.metrics.aline import delta, align
from IPython.display import Audio, display, Markdown

def play_audio(df, word, lang, accents: list = None):
    """Play audio for a given word and accent."""
    audio_path = f"output/words_wav/{lang}/sr16000"
    df_filtered = df[(df["word"] == word) & (df["lang"] == lang)]
    if accents is not None:
        df_filtered = df_filtered[df_filtered["accent"].isin(accents)]
    if df_filtered.empty:
        print(f"No audio found for word '{word}' in accents {accents}.")
    else:
        for _, row in df_filtered.iterrows():
            audio_file = f"{audio_path}/{row['wav_file']}"
            word = row["word"]
            phones_str = "".join(row["phone_list"])
            accent = row["accent"]
            display(Markdown(f"**{word}** ({accent}): [{phones_str}]"))
            # Play audio
            display(Audio(audio_file, autoplay=False))            

def my_levenshtein(a, b):
    """Because the original one fails with IPA characters"""
    n, m = len(a), len(b)
    # initialize distance matrix
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    # fill matrix
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,      # deletion
                dp[i][j - 1] + 1,      # insertion
                dp[i - 1][j - 1] + cost  # substitution
            )
    return dp[n][m]

def make_distances_df(df, lang='en'):
    # Create df with cols: word, accents (N cols), phones:
    df_words = (
        df
        .query(f"lang == '{lang}'")
        .pivot_table(
            index=["word", "id"],
            columns="accent",
            values="phone_list",
            aggfunc="first",
        )
        .reset_index()
    )
    # df_words.query("word == 'usa'")
    accent_cols = [col for col in df_words.columns if col not in ['word', 'id']]
    results = []
    for _, row in df_words.iterrows():
        word = row['word']
        wordid = row['id']
        for acc1, acc2 in combinations(accent_cols, 2):
            phones1 = row[acc1]
            phones2 = row[acc2]
            if not (isinstance(phones1, list) and isinstance(phones2, list)):
                l_dist = k_dist = None
            elif len(phones1) == 0 or len(phones2) == 0:
                l_dist = k_dist = None
            else:
                # phone_str_1 = "".join(phones1)
                # phone_str_2 = "".join(phones2)
                # l_dist = Levenshtein.distance(phone_str_1, phone_str_2)
                l_dist = my_levenshtein(phones1, phones2)
                # k_dist = delta(phones1, phones2)  # TODO this fails!
            results.append({
                'word': word,
                'wordid': wordid,
                'accent1': acc1,
                'accent2': acc2,
                'phones1': phones1,
                'phones2': phones2,
                'l_dist': l_dist,
                # 'kondrak_distance': k_dist, # TODO this fails!
                # 'n_phones': TODO, # using phone_list
            })
            # if word == "zucchini":
            # # if wordid == "en100165-55":
            #     print(f"Word: {word}, Accents: {acc1}, {acc2}, Phones: {row[acc1]}, {row[acc2]}, Levenshtein: {l_dist}")
    df_dist = pd.DataFrame(results)
    return df_dist
In [11]:
df_dist_en = make_distances_df(df_phones, lang='en')
df_dist_en["lang"] = 'en'
df_dist_es = make_distances_df(df_phones, lang='es')
df_dist_es["lang"] = 'es'

df_dist = pd.concat([df_dist_en, df_dist_es], ignore_index=True)
In [12]:
df_dist_en.sort_values(by='l_dist', ascending=False)
Out[12]:
word wordid accent1 accent2 phones1 phones2 l_dist lang
429410 faq en1032377 en.Australian en.scot [e, f, æʌ̯, k̟, iː] [f, ɹ, i, k, w, ə, n, d, l̪, ɨ, ø̞, s, k, w, ɛ, s, t̠ʃ, ɨ, n] 19.0 en
429431 faq en1032377 en.scot en.uk.rp [f, ɹ, i, k, w, ə, n, d, l̪, ɨ, ø̞, s, k, w, ɛ, s, t̠ʃ, ɨ, n] [ɛu, θ, e, k̟, j, uː] 19.0 en
429423 faq en1032377 en.Jamaica en.scot [ɛ, f, e, k̟, j, uː] [f, ɹ, i, k, w, ə, n, d, l̪, ɨ, ø̞, s, k, w, ɛ, s, t̠ʃ, ɨ, n] 19.0 en
429432 faq en1032377 en.scot en.us.south [f, ɹ, i, k, w, ə, n, d, l̪, ɨ, ø̞, s, k, w, ɛ, s, t̠ʃ, ɨ, n] [ɛ, f, æʌ̯, i, k̟, iu̜, ʊ] 18.0 en
429430 faq en1032377 en.scot en.uk.general [f, ɹ, i, k, w, ə, n, d, l̪, ɨ, ø̞, s, k, w, ɛ, s, t̠ʃ, ɨ, n] [ɛu, f, æʌ̯, k, uː] 18.0 en
... ... ... ... ... ... ... ... ...
1326379 zucchini en100165-55 en.uk.general en.us.south NaN NaN NaN en
1326380 zucchini en100165-55 en.uk.general en.us.us NaN NaN NaN en
1326381 zucchini en100165-55 en.uk.rp en.us.south NaN NaN NaN en
1326382 zucchini en100165-55 en.uk.rp en.us.us NaN NaN NaN en
1326383 zucchini en100165-55 en.us.south en.us.us NaN NaN NaN en

1326384 rows × 8 columns

In [13]:
df_dist_en.query("l_dist == 2").sample(2)
Out[13]:
word wordid accent1 accent2 phones1 phones2 l_dist lang
1232022 underachieve en1093310 en.uk.general en.uk.rp [ʌ, n, d, ɚː, ɹ, ɛu, t̠ʃ, ɨ, v] [ʌ, n, d̪, ɵ, ɹ, ɛu, t̠ʃ, ɨ, v] 2.0 en
268448 credence en1020523 en.uk.general en.us.us [k, ɻ, i, d, ɨ, n, z] [k, ɻ, i, d, ə, n, ts] 2.0 en
In [14]:
# Add relative distance:
df_dist_clean = df_dist.dropna(subset=['l_dist']).copy()
df_dist_clean["max_len"] = df_dist_clean.apply(lambda x: max(len(x['phones1']), len(x['phones2'])), axis=1)
df_dist_clean["l_dist_norm"] = df_dist_clean["l_dist"] / df_dist_clean["max_len"]
In [15]:
df_tmp = df_dist_clean.query("lang == 'es'").query("l_dist_norm > .5")
df_tmp.sample(20)
Out[15]:
word wordid accent1 accent2 phones1 phones2 l_dist lang max_len l_dist_norm
1366106 pareja es143329 es.Castellano es.Mexico [p, ə, ɾ, i, x, a̟] [p, a, r, e, x, a] 4.0 es 6 0.666667
1375712 sheriff es177727 es.Castellano es.Mexico [ʂ, i, ɛu, ɗ, iː, f] [ʃ, ɛ, r, iː, f] 4.0 es 6 0.666667
1380907 vaya es197027 es.Argentina es.Mexico [v, ɑ, zʲ, a] [ɓ, a, ɟʝ, a] 3.0 es 4 0.750000
1380050 tumbar es194173 es.Castellano es.Mexico [ʊ̃, m, β, ɑ] [ʊ, m, b, a] 3.0 es 4 0.750000
1343604 desdén es061965 es.Argentina es.Castellano [ð, e, v, ɛ, n] [d̪, ə, s, t̪, e, n] 5.0 es 6 0.833333
1382560 ánimo es012975 es.Argentina es.Mexico [a̟, n, i, m, õ] [a, n, ɪ, m, o] 3.0 es 5 0.600000
1368286 playa es151222 es.Argentina es.Mexico [l, a̟, ʒʲ, a̟] [l, a, ɟʝ, a] 3.0 es 4 0.750000
1340858 correa es052027 es.Castellano es.Mexico [o, r, i, a] [k, ɔ, r, e, a] 3.0 es 5 0.600000
1326952 abrigar es001470 es.Argentina es.Mexico [a, β, ɾ, ɪ, ɣ, a, r] [a̟, b, r, i, ɡ, a] 6.0 es 7 0.857143
1360071 lumbar es121927 es.Argentina es.Castellano [l, ʊ̃, m, ɓ, a] [l, ʊ, m, b, æ] 3.0 es 5 0.600000
1367797 pieza es149403 es.Argentina es.Mexico [j, ɛ, s̪̻, a] [p, i, e, s̪̻, a] 3.0 es 5 0.600000
1368999 postín es153819 es.Argentina es.Castellano [β, o, θ, t̪, i, n] [ɔ, s, t̪, ɪ, n] 4.0 es 6 0.666667
1364760 ofensivo es138231 es.Argentina es.Castellano [o, f, e, n, s̪̻, ɪ, β, o] [ɔ, f, ɨ, n, s, iː, β, ɵ] 5.0 es 8 0.625000
1345562 diálogo es066128 es.Castellano es.Mexico [ð, ɪ, a̟, l, o, k] [ð, i, a, l, o, ɣ, o] 4.0 es 7 0.571429
1374053 rodar es171827 es.Castellano es.Mexico [ɾ, r, o, d̪, æ] [r, o, ð, a] 3.0 es 5 0.600000
1345974 duro es070632 es.Argentina es.Castellano [d̪, ʊ, ɾ, oː] [d̪, uː, ɗ, o] 3.0 es 4 0.750000
1345259 disgusto es068139 es.Castellano es.Mexico [ð, ɪ, s̪̻, ɣ, ʊ, s, t̪, o] [ð, ɨ, z̪, ɡ, u, s, t, o] 5.0 es 8 0.625000
1329673 amo es011481 es.Argentina es.Mexico [a̟, m, õ] [a, m, o] 2.0 es 3 0.666667
1358243 jersey es115041 es.Castellano es.Mexico [ʔ, ɛ, d, au, s, æʌ̯] [d̠ʒ, e̞, r, s, ɪ] 5.0 es 6 0.833333
1344056 desnudez es063730 es.Castellano es.Mexico [d̪, ə, s, n, ʊ, ð, æʌ̯] [ð, e, z, n, uː, ð, e, s] 6.0 es 8 0.750000
In [16]:
play_audio(df_phones, "chusma", "es")

chusma (es.Castellano): [t̠ʃʊsma]

Your browser does not support the audio element.

chusma (es.Argentina): [t̠ʃʲʊme̞]

Your browser does not support the audio element.

chusma (es.Mexico): [t̠ʃuːz̪me̞]

Your browser does not support the audio element.
In [17]:
df_tmp = df_dist_clean.query("lang == 'en'").query("l_dist_norm > .5")
df_tmp.sample(10)
Out[17]:
word wordid accent1 accent2 phones1 phones2 l_dist lang max_len l_dist_norm
492582 general en037980 en.uk.general en.uk.rp [ʃ, æʌ̯, n, ɚː, ɹ, ai̯] [d̠ʒ, æʌ̯, n, r̪, o] 4.0 en 6 0.666667
636193 jaguar en1048722 en.Australian en.Jamaica [d̠ʒ, æ, ɡ, i, w, ə] [d̠ʒ, a̟, ɡ, o] 4.0 en 6 0.666667
640451 jingle en1049035 en.Irish en.uk.general [d̠ʒ, ɪ, ŋ, ɠ] [t̠ʃ, ɪ, ŋ, ɡ, ai̯] 3.0 en 5 0.600000
747466 monumental en1057441 en.uk.rp en.us.us [m, o, nʲ, y, m, e, n, t, o] [m, a, n, j, ɵ, m, ɛ, n, t, æʌ̯] 6.0 en 10 0.600000
152313 bumblebee en1011821 en.uk.rp en.us.south [e̞, m, ɓ, aɪ, b, i] [β, ɐ̃, m, ɓ, ɓ, e, j] 5.0 en 7 0.714286
573201 humdrum en1043095 en.Irish en.scot [h, o̞, m, d, r, o̞, m] [ɒ, m, d̠ʒ, o, m] 5.0 en 7 0.714286
435347 feel en033379 en.us.south en.us.us [θ, e, o] [f, ɨ, l] 3.0 en 3 1.000000
326761 dingy en025134 en.scot en.us.us [ɜ̃, ŋ, ʒʲ, ɪ] [d, ɛ, ŋ, d̠ʒ, i] 4.0 en 5 0.800000
869504 pond en065254 en.uk.general en.us.us [a, n, d] [p, ɑ, n, ɗ] 3.0 en 4 0.750000
1283994 warbler en1097795 en.Jamaica en.uk.rp [w, o̞, b, l, ɚː] [b, u, b, l, ai̯] 3.0 en 5 0.600000
In [18]:
play_audio(df_phones, "funky", "en", accents=["en.us.south", "en.Jamaica"])

funky (en.us.south): [fæ̃ŋkɪ]

Your browser does not support the audio element.

funky (en.Jamaica): [fõːŋk̟i]

Your browser does not support the audio element.
In [19]:
play_audio(df_phones, "anxious", "en", accents=["en.Jamaica", "en.us.us"])

anxious (en.Jamaica): [a̟ŋʃo̞s̪̻]

Your browser does not support the audio element.

anxious (en.us.us): [eŋt̠ʃɚːs]

Your browser does not support the audio element.
In [20]:
play_audio(df_phones, "dot", "en")

dot (en.Jamaica): [dɔʀt]

Your browser does not support the audio element.

dot (en.uk.general): [ɑːt]

Your browser does not support the audio element.

dot (en.Irish): [aːr̝]

Your browser does not support the audio element.

dot (en.us.us): [ɟʝɑt]

Your browser does not support the audio element.

dot (en.uk.rp): [ot]

Your browser does not support the audio element.

dot (en.scot): [do̞kt̪]

Your browser does not support the audio element.

dot (en.uk.Yorkshire): [o̞t]

Your browser does not support the audio element.

dot (en.us.south): [d̪ɑːr]

Your browser does not support the audio element.
In [21]:
# Avg rel distance:
avg_rel_dist = df_dist_clean.groupby(["lang", "accent1", "accent2"])["l_dist_norm"].mean().reset_index()
avg_rel_dist = avg_rel_dist.sort_values(by="l_dist_norm", ascending=False)
avg_rel_dist.head(10)
Out[21]:
lang accent1 accent2 l_dist_norm
20 en en.scot en.us.south 0.527315
2 en en.Australian en.scot 0.513059
23 en en.uk.general en.us.south 0.488045
18 en en.scot en.uk.general 0.483563
5 en en.Australian en.us.south 0.480799
25 en en.uk.rp en.us.south 0.475033
1 en en.Australian en.Jamaica 0.474636
21 en en.scot en.us.us 0.470016
19 en en.scot en.uk.rp 0.469233
16 en en.Jamaica en.us.south 0.469062
In [22]:
# Make a heatmap of the average relative distance:
import plotnine as p9

def heatmap(df, lang):
    df_plot = df.query(f"lang == '{lang}'").copy()
    df_plot["text"] = df_plot["l_dist_norm"].apply(lambda x: f"{x:.2f}")
    p = (
        p9.ggplot(df_plot, p9.aes(x='accent1', y='accent2', fill='l_dist_norm'))
        + p9.geom_tile(color='white')
        + p9.geom_text(p9.aes(label='text'), size=10, color='white')
        + p9.scale_fill_gradient(name='Avg. Levensh.\nnorm.', low='lightblue', high='darkblue')
        + p9.theme_bw()
        + p9.theme(
            figure_size=(6, 4),
            legend_position='right',
            axis_text_x=p9.element_text(rotation=45, hjust=1, size=10),
            axis_text_y=p9.element_text(size=10),
        )
        + p9.labs(x="", y="")
    )
    p.show()

heatmap(avg_rel_dist, lang='en')
heatmap(avg_rel_dist, lang='es')
No description has been provided for this image
No description has been provided for this image

Logistic¶

In [23]:
import numpy as np
import pandas as pd
import plotnine as p9
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display, Markdown, Audio


def contains_subsequence(data, pattern):
    m, n = len(data), len(pattern)
    if n == 0:
        return True   # empty pattern “always” matches
    if n > m:
        return False  # pattern longer than data can’t match
    # slide a window of length n over data
    for i in range(m - n + 1):
        if data[i:i+n] == pattern:
            return True
    return False

def coef_plot(df_coefs, stats_per_class, top_n=5, ncol=1):
    df_plot = df_coefs.groupby("accent").head(top_n).reset_index(drop=True).copy()
    # Make a label for each accent as: "accent\n(recall, rel_freq)"
    df_plot["accent_label"] = df_plot["accent"].apply(
        lambda x: f"{x}\n(recall: {stats_per_class[x][0]:.2f}, freq: {stats_per_class[x][1]:.2f})"
    )
    nrow = int(np.ceil(len(df_plot["accent"].unique()) / ncol))
    p = (
        p9.ggplot(
            df_plot,
            p9.aes(x="coef", y="reorder(phone+'::'+accent_label, coef)")
        )
        + p9.geom_segment(
            p9.aes(xend=0, yend="reorder(phone+'::'+accent_label, coef)"),
            color="navy", size=1, alpha=0.5
        )
        + p9.geom_point(size=1, color="navy")
        + p9.facet_wrap("accent_label", scales="free_y", ncol=ncol)
        + p9.scale_y_discrete(labels=lambda lst: [l.split("::")[0] for l in lst])
        + p9.theme_bw()
        + p9.theme(
            figure_size=(2 * ncol, 1.5 * nrow),
            axis_text_x=p9.element_text(rotation=45, hjust=1),
            axis_text_y=p9.element_text(size=10),
        )
        + p9.labs(
            y="", x="",
        )
        
    )
    return df_plot, p

def most_distinctive_phones(df, ngram_range=(1,1), min_freq=10):
    """Return a df of logistic regression coefficients for each phone
    for each accent, using n-grams of size ngram_order.
    Also return dict accent: (recall, rel_freq).
    """
    vectorizer = CountVectorizer(
        lowercase=False,
        analyzer='word',
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        ngram_range=ngram_range,
        min_df=min_freq,     # Minimum document frequency
        binary=True,  # Binary counts (presence/absence of n-grams)
    )

    X_vectorized = vectorizer.fit_transform(df["phone_list"].values)
    y = df["accent"].values
    model = LogisticRegression(max_iter=1000, random_state=33)
    model.fit(X_vectorized, y)
    coefs = model.coef_ 
    # print accuracy:
    maj_class_freq = pd.Series(y).value_counts(normalize=True).max()
    print(f"Accuracy: {model.score(X_vectorized, y):.3f} (maj class freq: {maj_class_freq:.3f})")
    # If binary, add first line with negative coefficients:
    if len(model.classes_) == 2:
        coefs = np.vstack([-coefs[0], coefs])
    df_coefs = pd.DataFrame(
        coefs.T, 
        index=vectorizer.get_feature_names_out(),
        columns=model.classes_,
    ).reset_index().rename(columns={"index": "phone"})
    # To long format:
    df_coefs_long = df_coefs.melt(id_vars=["phone"], var_name="accent", value_name="coef")
    df_coefs_long = df_coefs_long.sort_values(by=["accent", "coef"], ascending=[True, False])
    # Make dict accent: (recall, rel_freq)
    pred = model.predict(X_vectorized)
    stats_per_class = {}
    for cl in model.classes_:
        recall = np.mean(pred[y == cl] == cl)
        rel_freq = np.mean(y == cl)
        stats_per_class[cl] = (recall, rel_freq)
    return df_coefs_long, stats_per_class

def show_examples(df, phone_ngram, lang, accent, wavs_path="output/words_wav/en/sr16000", seed=33, n=3):
    """For a given phone ngram (list of phones in order) and an accent,
    show examples of words and their audios.
    """
    df_accent = df[(df["accent"] == accent) & (df["lang"] == lang)].copy()
    df_filtered = df_accent[df_accent["phone_list"].apply(lambda x: contains_subsequence(x, phone_ngram))]
    if df_filtered.empty:
        print(f"No examples found for phone ngram {phone_ngram} in accent {accent}.")
    else:
        df_filtered = df_filtered.sample(n=min(n, len(df_filtered)), random_state=seed)
        for _, row in df_filtered.iterrows():
            word = row["word"]
            phones_str = "".join(row["phone_list"])
            audio_file = f"{wavs_path}/{row['wav_file']}"
            display(Markdown(f"{phone_ngram} as in **{word}** ({accent}): [{phones_str}]"))
            display(Audio(audio_file, autoplay=False))

def run_everything(df, ngram_range=(1,1), lang="es", accents=None, seed=33, ncol=1):
    audio_path = f"output/words_wav/{lang}/sr16000"
    df_filtered = df[df["lang"] == lang].copy()
    if accents is None:
        accents = df_filtered["accent"].unique().tolist()
    df_filtered = df_filtered[df_filtered["accent"].isin(accents)].copy()
    df_coefs, stats_per_class = most_distinctive_phones(df_filtered, ngram_range=ngram_range, min_freq=10)
    # # vec, mod = most_distinctive_phones(df_filtered, ngram_order=ngram_order)
    # return vec, mod
    df_plot, p = coef_plot(df_coefs, stats_per_class, top_n=5, ncol=ncol)
    p.show()
    for _, row in df_plot.iterrows():
        phone_ngram = row["phone"].split(" ")
        accent = row["accent"]
        # print(row)
        # print()
        show_examples(df, phone_ngram, lang, accent, wavs_path=audio_path, seed=seed, n=1)

Español¶

In [24]:
run_everything(df_phones, ngram_range=(1, 1), lang="es", ncol=3)
Accuracy: 0.481 (maj class freq: 0.333)
No description has been provided for this image

['ʒ'] as in anillo (es.Argentina): [a̟nɪʒõː]

Your browser does not support the audio element.

['zʲ'] as in enrollar (es.Argentina): [ənrɔzʲɑ]

Your browser does not support the audio element.

['ʑ'] as in alcantarillado (es.Argentina): [alkant̪aɾiʑaðo]

Your browser does not support the audio element.

['ʒʲ'] as in embotelladora (es.Argentina): [əmβɔt̪əʒʲaðoɾa]

Your browser does not support the audio element.

['sʲ'] as in cajeta (es.Argentina): [asʲit̪a]

Your browser does not support the audio element.

['t̪s̪ː'] as in berberecho (es.Castellano): [berberet̪s̪ːo]

Your browser does not support the audio element.

['θ'] as in incentivar (es.Castellano): [ɪnθent̪ɪβaɾ]

Your browser does not support the audio element.

['tsʲ'] as in quitamanchas (es.Castellano): [ɪt̪ame̞ntsʲes̪̻]

Your browser does not support the audio element.

['ts'] as in mapache (es.Castellano): [ma̟patse]

Your browser does not support the audio element.

['ɟʝ'] as in yogur (es.Castellano): [ɟʝoːkʊ]

Your browser does not support the audio element.

['d̠ʒ'] as in conllevar (es.Mexico): [kond̠ʒeva]

Your browser does not support the audio element.

['ɟʝ'] as in husillo (es.Mexico): [ʊsɪɟʝo]

Your browser does not support the audio element.

['ɛu̜'] as in empeorar (es.Mexico): [empɛu̜oɾa]

Your browser does not support the audio element.

['z̪'] as in luctuoso (es.Mexico): [lʊkt̪ʊa̟z̪o]

Your browser does not support the audio element.

['o̞ː'] as in hoya (es.Mexico): [o̞ːɟʝə]

Your browser does not support the audio element.
In [25]:
run_everything(df_phones, ngram_range=(2, 2), lang="es", ncol=3)
Accuracy: 0.540 (maj class freq: 0.333)
No description has been provided for this image

['ʒʲ', 'o'] as in colmillo (es.Argentina): [kolmɪʒʲo]

Your browser does not support the audio element.

['x', 't̪'] as in persistir (es.Argentina): [æʌ̯sɪxt̪iː]

Your browser does not support the audio element.

['v', 'l'] as in venerable (es.Argentina): [venɪd̪avle]

Your browser does not support the audio element.

['zʲ', 'a'] as in cayado (es.Argentina): [azʲaðo]

Your browser does not support the audio element.

['ʒ', 'o'] as in pocillo (es.Argentina): [os̪̻s̪̻ɪʒo]

Your browser does not support the audio element.

['p', 'θ'] as in erupción (es.Castellano): [eðʊpθɪon]

Your browser does not support the audio element.

['θ', 'ɛ'] as in estremecer (es.Castellano): [ɛst̪ɾəmeθɛ]

Your browser does not support the audio element.

['θ', 'o'] as in escorzo (es.Castellano): [eskoɾθo]

Your browser does not support the audio element.

['ð', 'oː'] as in sólido (es.Castellano): [solɪðoː]

Your browser does not support the audio element.

['n', 't̪s̪ː'] as in balanza (es.Castellano): [balant̪s̪ːa]

Your browser does not support the audio element.

['oː', 'ɾ'] as in orín (es.Mexico): [oːɾɪn]

Your browser does not support the audio element.

['æʌ̯', 'a'] as in gimotear (es.Mexico): [hɨmo̞t̪æʌ̯a]

Your browser does not support the audio element.

['ɛ', 'v'] as in nueva (es.Mexico): [nʊɛva]

Your browser does not support the audio element.

['ɑ', 'v'] as in puñalada (es.Mexico): [ʊɲalɑva]

Your browser does not support the audio element.

['oː', 'n'] as in tronar (es.Mexico): [roːna]

Your browser does not support the audio element.
In [26]:
# run_everything(df_phones, ngram_range=(3, 3), lang="es", ncol=3)

English¶

In [27]:
run_everything(df_phones, ngram_range=(1, 2), lang="en", ncol=3)
# NOTE if binary, the coeff for the negative class are the negative of the overall coeffs.
# So the left plot displays the largest negative coefficients in absolute value.
Accuracy: 0.333 (maj class freq: 0.120)
No description has been provided for this image

['ai̯', 'ʃ'] as in coordination (en.Australian): [uːɔːdɪnai̯ʃən]

Your browser does not support the audio element.

['ai̯', 'ʂ'] as in inauguration (en.Australian): [ɪno̞ɡjʊɹai̯ʂɨn]

Your browser does not support the audio element.

['ɪ', 'o̞'] as in periodic (en.Australian): [pir̪ɪo̞dik]

Your browser does not support the audio element.

['t', 'e̯'] as in affectation (en.Australian): [ɛufɛːkte̯ʃən]

Your browser does not support the audio element.

['p', 'i̞'] as in happenstance (en.Australian): [hø̞pi̞nste̞ndz]

Your browser does not support the audio element.

['r̝'] as in cannot (en.Irish): [ke̞nɑːr̝]

Your browser does not support the audio element.

['eː', 'r̪'] as in sparing (en.Irish): [speːr̪ɪŋ]

Your browser does not support the audio element.

['ɛ', 'd̪'] as in withers (en.Irish): [wɛd̪ɚːz]

Your browser does not support the audio element.

['p', 'ʀ'] as in push (en.Irish): [pʀɔʃ]

Your browser does not support the audio element.

['oː', 'ɾ'] as in curettage (en.Irish): [kioːɾe̞taɪʒ]

Your browser does not support the audio element.

['ʃ', 'ʌ'] as in trustworthy (en.Jamaica): [ʃʌstwe̞ðɨ]

Your browser does not support the audio element.

['ʒ', 'ʌ'] as in circumcision (en.Jamaica): [səkʌmsiʒʌn]

Your browser does not support the audio element.

['ʂ', 'ʌ'] as in exemption (en.Jamaica): [ɛuɡz̪ɛmpʂʌn]

Your browser does not support the audio element.

['t̪ː'] as in split (en.Jamaica): [s̪̻plæʌ̯t̪ː]

Your browser does not support the audio element.

['ɓ', 'e'] as in barely (en.Jamaica): [ɓelɨ]

Your browser does not support the audio element.

['r', 'd̪'] as in girdle (en.scot): [ɡ̟ærd̪o]

Your browser does not support the audio element.

['ʀ', 's'] as in coercive (en.scot): [k̟wɛʀsœv]

Your browser does not support the audio element.

['ɚː', 'ɭ'] as in wanderer (en.scot): [wʌnd̪ɚːɭɚː]

Your browser does not support the audio element.

['ɾ'] as in silvery (en.scot): [seu̯ʊ̃ŋvɾe]

Your browser does not support the audio element.

['d', 'ð'] as in indecisive (en.scot): [e̞ndðəsæʌ̯sɪv]

Your browser does not support the audio element.

['t', 'ð'] as in advocate (en.uk.Yorkshire): [naʊnɛudðɵko̞tðəβɛudvɵke̯t]

Your browser does not support the audio element.

['d', 'ð'] as in advocate (en.uk.Yorkshire): [naʊnɛudðɵko̞tðəβɛudvɵke̯t]

Your browser does not support the audio element.

['ʂ', 'n'] as in rationalize (en.uk.Yorkshire): [ɹe̞ʂnæʌ̯z]

Your browser does not support the audio element.

['ɑː', 'ð'] as in farthest (en.uk.Yorkshire): [fɑːðɚːst]

Your browser does not support the audio element.

['uː', 'd̪'] as in poodle (en.uk.Yorkshire): [uːd̪õ]

Your browser does not support the audio element.

['ts', 'ɹ'] as in intrepid (en.uk.general): [ɪntsɹɛpɨd]

Your browser does not support the audio element.

['t̪', 'aɪ'] as in metal (en.uk.general): [mɛt̪aɪ]

Your browser does not support the audio element.

['t̠ʃ', 'j'] as in amateur (en.uk.general): [a̟me̞t̠ʃja̟]

Your browser does not support the audio element.

['õː', 'p'] as in topical (en.uk.general): [t̠ʃõːpikɔː]

Your browser does not support the audio element.

['ŋ', 'ʒʲ'] as in injured (en.uk.general): [ɪŋʒʲet]

Your browser does not support the audio element.

['bʱ'] as in perished (en.uk.rp): [bʱe̞ɻɛʃd]

Your browser does not support the audio element.

['ɚː', 'r̪'] as in literate (en.uk.rp): [leɭɚːr̪æʌ̯t]

Your browser does not support the audio element.

['tʰ'] as in tasting (en.uk.rp): [tʰæʌ̯stɪŋ]

Your browser does not support the audio element.

['ɵ', 'ɛ'] as in whatsoever (en.uk.rp): [wʌtsɵɛvɚː]

Your browser does not support the audio element.

['l', 'yː'] as in salute (en.uk.rp): [s̪̻elyːʈ]

Your browser does not support the audio element.

['ɨ', 'ɛ'] as in involve (en.us.south): [ɦɨɛnvɒld]

Your browser does not support the audio element.

['aː', 'z'] as in sympathize (en.us.south): [sempəθaːz]

Your browser does not support the audio element.

['ai̯', 'ɛu'] as in photograph (en.us.south): [fodeu̯ɡɹai̯ɛuf]

Your browser does not support the audio element.

['ai̯', 'ʃ'] as in exhilaration (en.us.south): [ɛuɡzæʌ̯lɚːɹai̯ʃɨn]

Your browser does not support the audio element.

['ɨ', 'ɛu'] as in insurer (en.us.south): [ɨɛunʃœːɚː]

Your browser does not support the audio element.

['ɒ', 'r'] as in parka (en.us.us): [pɒrkə]

Your browser does not support the audio element.

['n', 'tʰ'] as in untapped (en.us.us): [ɔntʰæpd]

Your browser does not support the audio element.

['ɻ', 'u'] as in scrutinize (en.us.us): [skɻutnæʌ̯z]

Your browser does not support the audio element.

['ɦ', 'ɨ'] as in elusive (en.us.us): [ɦɨluːsɪð]

Your browser does not support the audio element.

['l̪', 'ai̯'] as in latest (en.us.us): [l̪ai̯dɪst]

Your browser does not support the audio element.
In [28]:
run_everything(df_phones, ngram_range=(1, 1), lang="en", ncol=3)
# NOTE if binary, the coeff for the negative class are the negative of the overall coeffs.
# So the left plot displays the largest negative coefficients in absolute value.
Accuracy: 0.229 (maj class freq: 0.120)
No description has been provided for this image

['r̝'] as in kinesthetic (en.Australian): [iːneu̯st̪ɛr̝ek]

Your browser does not support the audio element.

['kʰ'] as in condescension (en.Australian): [kʰo̞ndɪsɛnʂɨn]

Your browser does not support the audio element.

['ɐ̃i'] as in snowdrop (en.Australian): [snɐ̃idɻɑːp]

Your browser does not support the audio element.

['ɾ'] as in formidable (en.Australian): [fɔwmeɾavol]

Your browser does not support the audio element.

['pʰ'] as in poncho (en.Australian): [pʰɒnt̠ʃʲæʌ̯r]

Your browser does not support the audio element.

['r̝'] as in cannot (en.Irish): [ke̞nɑːr̝]

Your browser does not support the audio element.

['d̪z̪'] as in flimsy (en.Irish): [flɛmd̪z̪i]

Your browser does not support the audio element.

['ɲ'] as in bologna (en.Irish): [əloɲa̟]

Your browser does not support the audio element.

['ʈʰ'] as in tantrum (en.Irish): [ʈʰæntzʌm]

Your browser does not support the audio element.

['t̪ʰ'] as in tornado (en.Irish): [t̪ʰo̞ːnai̯ɗɵ]

Your browser does not support the audio element.

['t̪ː'] as in split (en.Jamaica): [s̪̻plæʌ̯t̪ː]

Your browser does not support the audio element.

['ɔ̤'] as in cost (en.Jamaica): [kɔ̤st]

Your browser does not support the audio element.

['tː'] as in refit (en.Jamaica): [r̪iːfæʌ̯tː]

Your browser does not support the audio element.

['eː'] as in stale (en.Jamaica): [steːl]

Your browser does not support the audio element.

['t̪'] as in subterfuge (en.Jamaica): [sʌt̪ɚːfjuːd̠ʒ]

Your browser does not support the audio element.

['y'] as in brood (en.scot): [bryd̪]

Your browser does not support the audio element.

['ɾ'] as in silvery (en.scot): [seu̯ʊ̃ŋvɾe]

Your browser does not support the audio element.

['yː'] as in moody (en.scot): [myːdi]

Your browser does not support the audio element.

['ʉ̟'] as in to (en.scot): [t̪ʉ̟]

Your browser does not support the audio element.

['t̪'] as in theme (en.scot): [t̪im]

Your browser does not support the audio element.

['kʰ'] as in cook (en.uk.Yorkshire): [kʰo̞k]

Your browser does not support the audio element.

['tsʲ'] as in glitter (en.uk.Yorkshire): [litsʲa̟]

Your browser does not support the audio element.

['ə̆'] as in blessed (en.uk.Yorkshire): [naunblæʌ̯sə̆fəblæʌ̯st]

Your browser does not support the audio element.

['eu̜'] as in usable (en.uk.Yorkshire): [jeu̜ze̞ɓo]

Your browser does not support the audio element.

['ʈʰ'] as in tacit (en.uk.Yorkshire): [ʈʰe̞sid]

Your browser does not support the audio element.

['õː'] as in gallbladder (en.uk.general): [õːplætə]

Your browser does not support the audio element.

['ɕ'] as in dementia (en.uk.general): [ɨme̞nɕə]

Your browser does not support the audio element.

['ũː'] as in sword (en.uk.general): [smũːd]

Your browser does not support the audio element.

['œ̃'] as in outdoor (en.uk.general): [œ̃toʀ]

Your browser does not support the audio element.

['ãː'] as in tiara (en.uk.general): [ɪãːɻe̞]

Your browser does not support the audio element.

['bʱ'] as in perished (en.uk.rp): [bʱe̞ɻɛʃd]

Your browser does not support the audio element.

['tʰ'] as in tasting (en.uk.rp): [tʰæʌ̯stɪŋ]

Your browser does not support the audio element.

['ɐ̃i'] as in ira (en.uk.rp): [ɐ̃iɐ̃r̪æʌ̯j]

Your browser does not support the audio element.

['k̟ʰ'] as in keystone (en.uk.rp): [k̟ʰistaun]

Your browser does not support the audio element.

['kʰ'] as in keynote (en.uk.rp): [kʰɨnaut]

Your browser does not support the audio element.

['ɦ'] as in installation (en.us.south): [ɦiɪnstɛulai̯ʃɨn]

Your browser does not support the audio element.

['n̩'] as in impinge (en.us.south): [eu̯mpiːe̯n̩d̠ʒ]

Your browser does not support the audio element.

['ɾ'] as in modal (en.us.south): [moɾai̯]

Your browser does not support the audio element.

['ɓ'] as in embody (en.us.south): [emɓarɪ]

Your browser does not support the audio element.

['eː'] as in peck (en.us.south): [peːk]

Your browser does not support the audio element.

['ũː'] as in removal (en.us.us): [ʀɔmũːvo]

Your browser does not support the audio element.

['ĩː'] as in lewd (en.us.us): [lĩːŋudɨ]

Your browser does not support the audio element.

['ũ'] as in clue (en.us.us): [klũm]

Your browser does not support the audio element.

['õː'] as in pulp (en.us.us): [põːp]

Your browser does not support the audio element.

['ẽː'] as in flamenco (en.us.us): [flɵmẽːŋkau̜]

Your browser does not support the audio element.

Pairwise performance¶

In [29]:
def logistic_accuracy(df):
    vectorizer = CountVectorizer(
        lowercase=False,
        analyzer='word',
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        ngram_range=(1, 2),
        min_df=10,
        binary=True,
    )
    X_vectorized = vectorizer.fit_transform(df["phone_list"].values)
    y = df["accent"].values
    model = LogisticRegression(max_iter=1000, random_state=33)
    model.fit(X_vectorized, y)
    acc = model.score(X_vectorized, y)
    n_features = X_vectorized.shape[1]
    return acc, n_features

def pairwise_performance_df(df, lang='en', accents=None):
    df_filtered = df[df["lang"] == lang].copy()
    if accents is None:
        accents = df_filtered["accent"].unique().tolist()
    df_filtered = df_filtered[df_filtered["accent"].isin(accents)].copy()
    accent_pairs = combinations(accents, 2)
    results = []
    for acc1, acc2 in accent_pairs:
        # Keep only rows with the two and words in common:
        df_pair = df_filtered[(df_filtered["accent"] == acc1) | (df_filtered["accent"] == acc2)].copy()
        common_words = df_pair["word"].unique()
        df_pair = df_pair[df_pair["word"].isin(common_words)].copy()
        if df_pair.empty:
            print(f"No common words for accents {acc1} and {acc2}.")
            continue
        acc, n_features = logistic_accuracy(df_pair)
        maj_class_freq = df_pair["accent"].value_counts(normalize=True).max()
        n = len(df_pair)
        results.append({
            'accent1': acc1,
            'accent2': acc2,
            'accuracy': acc,
            'maj_class_freq': maj_class_freq,
            'n': n,
            'n_features': n_features,
        })
    df_results = pd.DataFrame(results)
    return df_results
In [30]:
df_acc_en = pairwise_performance_df(df_phones, lang='en')
df_acc_es = pairwise_performance_df(df_phones, lang='es')
In [31]:
df_acc_en["rel_acc"] = df_acc_en["accuracy"] / df_acc_en["maj_class_freq"]
df_acc_es["rel_acc"] = df_acc_es["accuracy"] / df_acc_es["maj_class_freq"]
In [32]:
def heatmap(df):
    df_plot = df.copy()
    # accents = sorted(df_plot["accent1"].unique())
    # df_plot["accent1"] = pd.Categorical(df_plot["accent1"]) #, categories=accents) #, ordered=True)
    # df_plot["accent2"] = pd.Categorical(df_plot["accent2"]) #, categories=accents) #, ordered=True)
    df_plot["text"] = df_plot.apply(lambda d: f"{d['rel_acc']:.2f}\n({d['accuracy']:.2f})", axis=1)
    # return df_plot
    p = (
        p9.ggplot(df_plot, p9.aes(x='accent2', y='accent1', fill='rel_acc'))
        + p9.geom_tile(color='white')
        + p9.geom_text(p9.aes(label='text'), size=9, color='white')
        + p9.scale_fill_gradient(name='XYZ', low='darkblue', high='lightblue')
        + p9.theme_bw()
        + p9.theme(
            figure_size=(6, 4),
            legend_position='right',
            axis_text_x=p9.element_text(rotation=45, hjust=1, size=10),
            axis_text_y=p9.element_text(size=10),
        )
        + p9.labs(x="", y="")
    )
    p.show()

heatmap(df_acc_en)
No description has been provided for this image
In [33]:
df_acc_en
Out[33]:
accent1 accent2 accuracy maj_class_freq n n_features rel_acc
0 en.Jamaica en.uk.Yorkshire 0.734667 0.500149 36833 2168 1.468896
1 en.Jamaica en.uk.rp 0.717520 0.504632 36484 2132 1.421867
2 en.Jamaica en.scot 0.729581 0.500081 36828 2277 1.458924
3 en.Jamaica en.us.us 0.711429 0.510707 36050 2134 1.393026
4 en.Jamaica en.uk.general 0.748832 0.500081 36828 2156 1.497421
5 en.Jamaica en.us.south 0.772727 0.528323 34848 2191 1.462604
6 en.Jamaica en.Irish 0.716356 0.500639 36775 2130 1.430884
7 en.Jamaica en.Australian 0.801696 0.673113 27352 1964 1.191027
8 en.uk.Yorkshire en.uk.rp 0.690012 0.504781 36495 2092 1.366953
9 en.uk.Yorkshire en.scot 0.717962 0.500068 36839 2260 1.435729
10 en.uk.Yorkshire en.us.us 0.745182 0.510857 36061 2126 1.458691
11 en.uk.Yorkshire en.uk.general 0.691197 0.500068 36839 2094 1.382206
12 en.uk.Yorkshire en.us.south 0.784991 0.528472 34859 2187 1.485398
13 en.uk.Yorkshire en.Irish 0.707470 0.500788 36786 2116 1.412713
14 en.uk.Yorkshire en.Australian 0.792822 0.673245 27363 1904 1.177614
15 en.uk.rp en.scot 0.720307 0.504714 36490 2264 1.427160
16 en.uk.rp en.us.us 0.694612 0.506076 35712 2088 1.372545
17 en.uk.rp en.uk.general 0.655248 0.504714 36490 2059 1.298257
18 en.uk.rp en.us.south 0.747899 0.523703 34510 2142 1.428097
19 en.uk.rp en.Irish 0.696133 0.503993 36437 2094 1.381235
20 en.uk.rp en.Australian 0.757459 0.669023 27014 1895 1.132186
21 en.scot en.us.us 0.746228 0.510789 36056 2262 1.460933
22 en.scot en.uk.general 0.740511 0.500000 36834 2265 1.481023
23 en.scot en.us.south 0.787514 0.528404 34854 2303 1.490362
24 en.scot en.Irish 0.729453 0.500720 36781 2240 1.456806
25 en.scot en.Australian 0.790226 0.673185 27358 2077 1.173861
26 en.us.us en.uk.general 0.719797 0.510789 36056 2101 1.409187
27 en.us.us en.us.south 0.715489 0.517637 34076 2114 1.382221
28 en.us.us en.Irish 0.696720 0.510069 36003 2078 1.365933
29 en.us.us en.Australian 0.762603 0.663619 26580 1907 1.149158
30 en.uk.general en.us.south 0.758765 0.528404 34854 2135 1.435956
31 en.uk.general en.Irish 0.701531 0.500720 36781 2094 1.401043
32 en.uk.general en.Australian 0.757475 0.673185 27358 1882 1.125210
33 en.us.south en.Irish 0.741674 0.527686 34801 2122 1.405522
34 en.us.south en.Australian 0.773229 0.647687 25378 1938 1.193831
35 en.Irish en.Australian 0.768541 0.672551 27305 1880 1.142725

Buscador de ejemplos¶

In [34]:
def play_audio(df, word, lang, accents: list = None):
    """Play audio for a given word and accent."""
    audio_path = f"output/words_wav/{lang}/sr16000"
    df_filtered = df[(df["word"] == word) & (df["lang"] == lang)]
    if accents is not None:
        df_filtered = df_filtered[df_filtered["accent"].isin(accents)]
    if df_filtered.empty:
        print(f"No audio found for word '{word}' in accents {accents}.")
    else:
        for _, row in df_filtered.iterrows():
            audio_file = f"{audio_path}/{row['wav_file']}"
            word = row["word"]
            phones_str = "".join(row["phone_list"])
            accent = row["accent"]
            display(Markdown(f"**{word}** ({accent}): [{phones_str}]"))
            # Play audio
            display(Audio(audio_file, autoplay=False)) 
In [35]:
play_audio(df_phones, "zucchini", "en", accents=["en.uk.general"])

zucchini (en.uk.general): [sək̟ine]

Your browser does not support the audio element.
In [36]:
from IPython.display import Audio, display, Markdown
from pathlib import Path

def play_audio(word, accent, path_prefix="output/words_wav/es/sr16000"):
    # find wav file containing word and accent:
    audio_files = list(Path(path_prefix).glob(f"{word}_*{accent}.wav"))
    if not audio_files:
        print(f"No audio files found for {word} with accent {accent}")
        return
    audio_path = audio_files[0]  # Take the first matching file
    try:
        display(Markdown(f"**{word}** ({accent})"))
        display(Audio(audio_path, autoplay=True))
    except FileNotFoundError:
        print(f"Audio file not found: {audio_path}")
In [37]:
play_audio("taberna", "es.Mexico")
# play_audio("caverna", "es.Mexico")
# play_audio("taberna", "es.Argentina")
# play_audio("caverna", "es.Argentina")

taberna (es.Mexico)

Your browser does not support the audio element.